import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, GridSearchCV
from pandas_profiling import ProfileReport
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
from sklearn.dummy import DummyRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
# Global plot styling for the whole notebook.
# NOTE(review): the "seaborn" style name was deprecated in matplotlib 3.6
# (renamed "seaborn-v0_8") — confirm the matplotlib version this runs under.
plt.style.use("seaborn")
plt.rcParams['font.size'] = 16
This is an Airbnb price prediction project based on https://www.kaggle.com/kritikseth/us-airbnb-open-data
The main goal of our model is to predict the price of an Airbnb listing based on its features. Airbnb might want to implement a "suggested price" feature: the user enters all of the attributes of their listing, such as location, number of rooms, and amount of space, and the app recommends a reasonable price, creating a better experience for both hosts and guests.
Since we are trying to predict a numerical feature, we use regression models. We fit Dummy, Ridge, LGBM, and XGBoost models and use the RMSE to determine which performs best on the test set. We expect the ensemble methods, i.e. LGBM and XGBoost, to perform better.
# Load the raw US Airbnb listings and immediately carve out a held-out test
# split so all EDA below touches only the training portion.
df = pd.read_csv("AB_US_2020.csv", low_memory=False)
df.head()
# test_size=0.25 is the train_test_split default, written out for clarity.
df_train, df_test = train_test_split(df, test_size=0.25, random_state=123)
df_train.shape
Exploratory data analysis, including outlier detection.
# Automated EDA report plus a closer look at the price distribution.
profile = ProfileReport(df_train)
profile.to_notebook_iframe()
df_train["price"].hist(bins=50)
price_target = df_train["price"]
# Zoom in on the heavy tail above 2500 before deciding to cut it.
price_target[price_target > 2500].hist(bins=100)
# Keep only listings priced under 2500 in both splits.
df_train = df_train.loc[df_train["price"] < 2500]
df_test = df_test.loc[df_test["price"] < 2500]
Discussion: Most of the prices range from 0 to 2500, and there are 890 outliers above 2500. However, the majority of the outliers lie below 10000, and 10000 appears more often than any other outlier price. This is probably because users set the price to 10000 just for fun and did not intend to rent their units. On the other hand, the maximum price of 24999 also occurs quite a few times, probably because users tried to set their unit price to the highest on the market just to have their listing appear at the very top.
In the end, we decided that getting rid of the outliers yielded better results.
# Monthly-review rate: inspect the tail above 10 reviews/month, then trim
# the extreme outliers (>= 20) from both splits.
reviews_per_month = df_train["reviews_per_month"]
reviews_per_month[reviews_per_month > 10].hist(bins=100)
# NOTE(review): this comparison is False for NaN, so rows with a missing
# reviews_per_month are dropped here as well — even though the pipeline's
# median imputer was set up to handle them. Confirm this is intended.
df_train = df_train.loc[df_train["reviews_per_month"] < 20]
df_test = df_test.loc[df_test["reviews_per_month"] < 20]
Discussion: Most of the listings have around 0 to 10 reviews per month, and there are 293 outliers with more than 10 reviews per month. This is probably because these listings are currently much more popular and active than the typical listing on Airbnb.
In the end, we decided that getting rid of the outliers yielded better results.
# Total review counts: visualize and count the heavy tail above 200 reviews.
total_reviews = df_train["number_of_reviews"]
total_reviews[total_reviews > 200].hist(bins=100)
total_reviews[total_reviews > 200].size
Discussion: Most of the listings have around 0 to 200 reviews in total, and there are 5352 outliers with more than 200 reviews. These outliers are much more popular, and therefore evidently acceptable to consumers, so we keep these listings for our prediction.
# reference to: https://towardsdatascience.com/easy-steps-to-plot-geographic-data-on-a-map-python-11217859a2db
# Scatter every listing's coordinates over a US map image to spot
# geographically isolated (rural) listings.
bbox = (df.longitude.min(), df.longitude.max(), df.latitude.min(), df.latitude.max())
background = plt.imread('./map.png')
fig, ax = plt.subplots(figsize=(15, 7))
ax.scatter(df.longitude, df.latitude, c='g', s=3)
ax.set_xlim(bbox[0], bbox[1])
ax.set_ylim(bbox[2], bbox[3])
ax.imshow(background, extent=bbox)
Discussion: The outliers here are listings located in more rural areas, and we would not want to look at those listings as they are generally priced differently.
df_train.info()
Discussion: From the dataframe info above, we can see that there are missing values in name, neighbourhood_group, host_name, last_review, and reviews_per_month. Since neighbourhood_group has more than half of its entries missing, we decided to take it out of the feature list.
Discussion: From above, we've noticed that more than half of neighbourhood_group's entries are missing (i.e., NaN). Furthermore, we already have neighbourhood, which can act as a substitute. We also dropped id and host_name since they are not useful for prediction, being completely out of context. We dropped last_review because it depends on the time the dataset was gathered.
# Columns excluded from modeling (mostly-missing, identifiers, or
# collection-time-dependent), plus the feature lists fed to the
# ColumnTransformer below.
drop_features = ["neighbourhood_group", "id", "host_name", "last_review"]
df_train_proc = df_train.drop(columns=drop_features)
df_test_proc = df_test.drop(columns=drop_features)
numeric_features = [
    "latitude",
    "longitude",
    "number_of_reviews",
    "reviews_per_month",
    "calculated_host_listings_count",
    "availability_365",
]
categorical_features = ["city", "room_type", "neighbourhood"]
# Preprocessing: median-impute + standardize numerics, constant-impute +
# one-hot encode categoricals, and bag-of-words encode the listing name.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="?")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])
# A one-step pipeline adds nothing for the text column, so the vectorizer
# goes into the ColumnTransformer directly.
name_vectorizer = CountVectorizer(max_features=1000, decode_error="ignore", stop_words=None)
preprocessor = ColumnTransformer(transformers=[
    ("numeric", numeric_transformer, numeric_features),
    ("categorical", categorical_transformer, categorical_features),
    ("countvec", name_vectorizer, "name"),
])
# Fill missing listing names with a blank string before vectorizing.
# BUG FIX: `df_train["name"].fillna(' ')` was a no-op — fillna returns a new
# Series and the result was discarded, so missing names stayed NaN and were
# later stringified to the literal token "nan" by astype("U"). It also
# targeted df_train, while the features are built from df_train_proc.
# Assign the filled column back onto the *_proc frames actually used below.
df_train_proc["name"] = df_train_proc["name"].fillna(' ')
df_test_proc["name"] = df_test_proc["name"].fillna(' ')
# Split into features/target; astype("U") coerces everything to unicode so
# CountVectorizer can consume the "name" column.
X_train = (df_train_proc.loc[:, df_train_proc.columns != "price"]).astype("U")
y_train = df_train_proc["price"]
X_test = (df_test_proc.loc[:, df_test_proc.columns != "price"]).astype("U")
y_test = df_test_proc["price"]
# ----------------
# dummy model (baseline)
# ----------------
dummy_pipe = Pipeline([
    ("preprocessing", preprocessor),
    ("regressor", DummyRegressor()),
])
dummy_pipe.fit(X_train, y_train)
preds_dummy = dummy_pipe.predict(X_train)
# Training-set RMSE and R^2 as the baseline reference point.
dummy_rmse_score = np.sqrt(mean_squared_error(y_train, preds_dummy))
dummy_r2_score = r2_score(y_train, preds_dummy)
print("Dummy Scores: RMSE =", dummy_rmse_score, ", r2 =", dummy_r2_score)
# ----------------
# ridge model
# ----------------
ridge_pipe = Pipeline([
    ("preprocessing", preprocessor),
    ("regressor", Ridge(alpha=100, random_state=123)),
])
ridge_pipe.fit(X_train, y_train)
preds_ridge = ridge_pipe.predict(X_train)
# Training-set RMSE and R^2 for the linear baseline.
ridge_rmse_score = np.sqrt(mean_squared_error(y_train, preds_ridge))
ridge_r2_score = r2_score(y_train, preds_ridge)
print("Ridge Scores: RMSE =", ridge_rmse_score, ", r2 =", ridge_r2_score)
# ----------------
# LGBM model
# ----------------
lgbm_pipe = Pipeline([
    ("preprocessing", preprocessor),
    ("regressor", LGBMRegressor(random_state=123)),
])
lgbm_pipe.fit(X_train, y_train)
preds_lgbm = lgbm_pipe.predict(X_train)
# Training-set RMSE and R^2 for the LightGBM ensemble.
lgbm_rmse_score = np.sqrt(mean_squared_error(y_train, preds_lgbm))
lgbm_r2_score = r2_score(y_train, preds_lgbm)
print("LGBM Scores: RMSE =", lgbm_rmse_score, ", r2 =", lgbm_r2_score)
# ----------------
# XGBoost model
# ----------------
# NOTE: the step is named "classifier" (a misnomer for a regressor) because
# the grid-search parameter grid elsewhere in this file addresses it as
# classifier__*, so the name must stay.
xg_pipe = Pipeline([
    ('preprocessing', preprocessor),
    ('classifier', XGBRegressor(random_state=123)),
])
xg_pipe.fit(X_train, y_train)
preds_xg = xg_pipe.predict(X_train)
# Training-set RMSE and R^2 for the XGBoost ensemble.
xg_rmse_score = np.sqrt(mean_squared_error(y_train, preds_xg))
xg_r2_score = r2_score(y_train, preds_xg)
print("XGBoost Scores: RMSE =", xg_rmse_score, ", r2 =", xg_r2_score)
Discussion: From the results above, we see that XGBoost has the best base model. Thus, we will do our hyperparameter tuning based on XGBoost.
# Candidate hyperparameters for the XGBoost step of xg_pipe.
hypers = {
    'classifier__max_depth': [5, 6, 7],
    'classifier__learning_rate': [0.3, 0.2, 0.1, 0.01],
}
# 5-fold grid search scored by R^2; the string "r2" is sklearn's built-in
# equivalent of make_scorer(r2_score).
grid_search_xg = GridSearchCV(
    xg_pipe,
    hypers,
    scoring="r2",
    n_jobs=-1,
    verbose=5,
    return_train_score=True,
    cv=5,
)
grid_search_xg.fit(X_train, y_train)
print("Best params: ", grid_search_xg.best_params_)
# Best pipeline refit on the full training set by GridSearchCV.
optimized_xgboost = grid_search_xg.best_estimator_
gs_xg_train_preds = optimized_xgboost.predict(X_train)
xg_rmse_train_score = np.sqrt(mean_squared_error(y_train, gs_xg_train_preds))
xg_r2_train_score = r2_score(y_train, gs_xg_train_preds)
print("XGBoost Train Scores: RMSE =", xg_rmse_train_score, ", r2 =", xg_r2_train_score)
grid_search_xg.cv_results_
# Compact CV report, best configuration first.
columns = [
    'mean_test_score', 'mean_train_score', 'mean_fit_time', 'rank_test_score', 'param_classifier__learning_rate', 'param_classifier__max_depth'
]
pd.DataFrame(grid_search_xg.cv_results_)[columns].sort_values(by=['rank_test_score'])
Discussion: From the result, we can see that the model with the highest learning rate, and highest depth had the highest test score of 0.386021.
# Final train-vs-test comparison of the tuned model.
# CONSISTENCY FIX: the original predicted with `optimized_xgboost` for train
# but `grid_search_xg` for test. grid_search_xg.predict delegates to the
# same best_estimator_, so both give identical predictions — use one object
# throughout to avoid implying they differ.
# get train accuracy
gs_xg_train_preds = optimized_xgboost.predict(X_train)
xg_rmse_train_score = np.sqrt(mean_squared_error(y_train, gs_xg_train_preds))
xg_r2_train_score = r2_score(y_train, gs_xg_train_preds)
print("XGBoost Train Scores: RMSE =", xg_rmse_train_score, ", r2 =", xg_r2_train_score)
# get test accuracy
gs_xg_test_preds = optimized_xgboost.predict(X_test)
xg_rmse_test_score = np.sqrt(mean_squared_error(y_test, gs_xg_test_preds))
xg_r2_test_score = r2_score(y_test, gs_xg_test_preds)
print("XGBoost Test Scores: RMSE =", xg_rmse_test_score, ", r2 =", xg_r2_test_score)
Discussion: From the results, we can see that our model has overfitted a little. This is probably because we did not have enough data and did not have enough features. Features that would greatly influence prices, such as number of bathrooms, year of the building, patio, and bed sizes, are not included in the dataset. Therefore, we would not rate this model as an accurate assessment of the pricing of an Airbnb listing.